{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "# add the '../../../code' directory (resolved against the current working directory) to the import path\n",
    "# NOTE(review): presumably this is what makes the `utils` package imported further below resolvable -- confirm\n",
    "sys.path.append(os.path.abspath('../../../code'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from types import SimpleNamespace\n",
    "\n",
    "# notebook configuration, bundled in a single namespace object\n",
    "args = SimpleNamespace(\n",
    "    # file with the annotated spans to read\n",
    "    input_file='../../../data/exdata/thau2019/thau2019_spans_matched_to_manifesto_texts.jsonl',\n",
    "    # file the processed corpus is written to, and whether an existing file may be replaced\n",
    "    output_file='../../../data/annotation/exdata/uk-manifestos_thau2019_annotations.jsonl',\n",
    "    overwrite_output=False,\n",
    "    # print diagnostics while processing\n",
    "    verbose=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from collections import Counter\n",
    "\n",
    "import numpy as np\n",
    "# print floats inside numpy arrays with three decimal places\n",
    "np.set_printoptions(formatter={'float': lambda x: \"{0:0.3f}\".format(x)})\n",
    "import pandas as pd\n",
    "# show up to 15 columns and use a display width of 320 characters for pandas output\n",
    "pd.set_option('display.max_columns', 15)\n",
    "pd.set_option('display.width', 320)\n",
    "\n",
    "# corpus container used below to load, inspect and save the annotations\n",
    "from utils.corpus import DoccanoAnnotationsCorpus"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Build the label map: an integer code for the 'O' tag and for the 'I-' and 'B-' tags of each category."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'O': 0,\n",
       " 'I-Age/generation': 1,\n",
       " 'B-Age/generation': 11,\n",
       " 'I-Economic class': 2,\n",
       " 'B-Economic class': 12,\n",
       " 'I-Ethnicity/race': 3,\n",
       " 'B-Ethnicity/race': 13,\n",
       " 'I-Gender': 4,\n",
       " 'B-Gender': 14,\n",
       " 'I-Geography': 5,\n",
       " 'B-Geography': 15,\n",
       " 'I-Health': 6,\n",
       " 'B-Health': 16,\n",
       " 'I-Nationality': 7,\n",
       " 'B-Nationality': 17,\n",
       " 'I-Religion': 8,\n",
       " 'B-Religion': 18,\n",
       " 'I-Other': 9,\n",
       " 'B-Other': 19,\n",
       " 'I-none': 10,\n",
       " 'B-none': 20}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# categories a span can be labeled with\n",
    "types = [\n",
    "    'Age/generation',\n",
    "    'Economic class',\n",
    "    'Ethnicity/race',\n",
    "    'Gender',\n",
    "    'Geography',\n",
    "    'Health',\n",
    "    'Nationality',\n",
    "    'Religion',\n",
    "    'Other',\n",
    "    'none',\n",
    "]\n",
    "\n",
    "# 'O' gets code 0; the k-th category (counting from 1) gets code k for its 'I-' tag\n",
    "# and code k + len(types) for its 'B-' tag\n",
    "n_types = len(types)\n",
    "cat2code = {'O': 0}\n",
    "for code, category in enumerate(types, start=1):\n",
    "    cat2code.update({'I-' + category: code, 'B-' + category: code + n_types})\n",
    "cat2code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Warning: annotation of token 42 in document conservatives-1974-02-191-2 disambiguated\n"
     ]
    }
   ],
   "source": [
    "# create a corpus with the label map `cat2code` and load the annotations from the input file;\n",
    "# they are stored under the annotator ID 'thau2019'\n",
    "# NOTE(review): with `verbose` enabled the loader printed a disambiguation warning for one document (see the cell output)\n",
    "acorp = DoccanoAnnotationsCorpus(cat2code)\n",
    "acorp.load_from_jsonlines(fp=args.input_file, annotator_id='thau2019', verbose=args.verbose)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(['the', 'oldest'], array([11,  1]))"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# look at the document for which the loader printed a warning:\n",
    "# show its tokens with a non-zero label code together with those codes\n",
    "warned_doc = acorp.docs[acorp.doc_id2idx['conservatives-1974-02-191-2']]\n",
    "label_codes = warned_doc.annotations['thau2019']\n",
    "is_labeled = label_codes > 0\n",
    "labeled_positions = np.nonzero(is_labeled)[0]\n",
    "[warned_doc.tokens[pos] for pos in labeled_positions], label_codes[is_labeled]\n",
    "# looks fine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No. docs: 4070\n",
      "(array([1]), array([4070]))\n"
     ]
    }
   ],
   "source": [
    "print('No. docs:', acorp.ndocs)\n",
    "# distribution of the number of annotations per document (i.e., how many singly/multiply annotated?)\n",
    "annotations_per_doc = np.asarray([doc.n_annotations for doc in acorp.docs])\n",
    "print(np.unique(annotations_per_doc, return_counts=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# identify duplicate texts (if any)\n",
    "texts = Counter()\n",
    "for doc in acorp.docs:\n",
    "    texts.update([doc.text])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(array([1, 2]), array([4026,   22]))\n"
     ]
    }
   ],
   "source": [
    "# how many texts occur once, twice, ...?\n",
    "occurrences = np.asarray(list(texts.values()))\n",
    "print(np.unique(occurrences, return_counts=True))\n",
    "# 22 sentences are verbatim duplicates (possible because we sampled based on within-manifesto sentence IDs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# collect the texts that occur more than once (in order of decreasing frequency)\n",
    "duplicated = []\n",
    "for text, n_occurrences in texts.most_common():\n",
    "    if n_occurrences > 1:\n",
    "        duplicated.append(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# map each duplicated text to the IDs of the documents that contain it (in corpus order)\n",
    "duplicates_ids = {}\n",
    "for doc in acorp.docs:\n",
    "    if doc.text not in duplicated:\n",
    "        continue\n",
    "    duplicates_ids.setdefault(doc.text, []).append(doc.id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mconservatives-1966-141-1\u001b[0m\n",
      "'legislate to allow ground leaseholders to buy or rent their houses on fair terms except where the property is to be redeveloped .'\n",
      "                    \u001b[44m      \u001b[49m \u001b[44m            \u001b[49m                                                                                           \t(thau2019)\n",
      "\u001b[1mconservatives-1966-188-1\u001b[0m\n",
      "'legislate to allow ground leaseholders to buy or rent their houses on fair terms except where the property is to be redeveloped .'\n",
      "                    \u001b[44m      \u001b[49m \u001b[44m            \u001b[49m                                                                                           \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mconservatives-1970-209-1\u001b[0m\n",
      "'we are publishing separate manifestos for scotland and wales .'\n",
      "                                                        \u001b[44m     \u001b[49m  \t(thau2019)\n",
      "\u001b[1mconservatives-1974-02-288-1\u001b[0m\n",
      "'we are publishing separate manifestos for scotland and wales .'\n",
      "                                           \u001b[44m        \u001b[49m     \u001b[44m     \u001b[49m  \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mconservatives-1974-02-65-1\u001b[0m\n",
      "'it is no part of our policy to see the wives and children of men on strike suffering .'\n",
      "                                        \u001b[44m     \u001b[49m \u001b[44m   \u001b[49m \u001b[44m        \u001b[49m                             \t(thau2019)\n",
      "\u001b[1mconservatives-1974-02-100-2\u001b[0m\n",
      "'it is no part of our policy to see the wives and children of men on strike suffering .'\n",
      "                                        \u001b[44m     \u001b[49m \u001b[44m   \u001b[49m \u001b[44m        \u001b[49m                             \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mlabour-2005-553-1\u001b[0m\n",
      "'we are proud to have brought in the human rights act , enabling british citizens to take action in british courts rather than having to wait years to seek redress in strasbourg .'\n",
      "                                                                 \u001b[44m       \u001b[49m \u001b[44m        \u001b[49m                                                                                                  \t(thau2019)\n",
      "\u001b[1mlabour-2010-533-2\u001b[0m\n",
      "'we are proud to have brought in the human rights act , enabling british citizens to take action in british courts rather than having to wait years to seek redress in strasbourg .'\n",
      "                                                                 \u001b[44m       \u001b[49m \u001b[44m        \u001b[49m                                                                                                  \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mconservatives-2010-40-2\u001b[0m\n",
      "'we will increase the private sector ' s share of the economy in all regions of the country , especially outside london and the south east .'\n",
      "                                                                 \u001b[44m   \u001b[49m \u001b[44m       \u001b[49m \u001b[44m  \u001b[49m \u001b[44m   \u001b[49m \u001b[44m       \u001b[49m                                 \u001b[44m   \u001b[49m \u001b[44m     \u001b[49m \u001b[44m    \u001b[49m  \t(thau2019)\n",
      "\u001b[1mconservatives-2010-149-1\u001b[0m\n",
      "'we will increase the private sector ' s share of the economy in all regions of the country , especially outside london and the south east .'\n",
      "                                                                 \u001b[44m   \u001b[49m \u001b[44m       \u001b[49m \u001b[44m  \u001b[49m \u001b[44m   \u001b[49m \u001b[44m       \u001b[49m                                 \u001b[44m   \u001b[49m \u001b[44m     \u001b[49m \u001b[44m    \u001b[49m  \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mlabour-2010-311-1\u001b[0m\n",
      "'control immigration through our australian-style points-based system , ensuring that as growth returns we see rising levels of employment and wages , not rising immigration , and requiring newcomers to earn citizenship and the entitlements it brings .'\n",
      "                                                                                                                                                                                              \u001b[44m         \u001b[49m                                                     \t(thau2019)\n",
      "\u001b[1mlabour-2010-651-1\u001b[0m\n",
      "'control immigration through our australian-style points-based system , ensuring that as growth returns we see rising levels of employment and wages , not rising immigration , and requiring newcomers to earn citizenship and the entitlements it brings .'\n",
      "                                                                                                                                                                                              \u001b[44m         \u001b[49m                                                     \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mlabour-2010-568-1\u001b[0m\n",
      "'conduct a strategic defence review to equip our armed forces for 21st century challenges , and support our troops and veterans .'\n",
      "                                                                                                        \u001b[44m   \u001b[49m \u001b[44m      \u001b[49m               \t(thau2019)\n",
      "\u001b[1mlabour-2010-673-1\u001b[0m\n",
      "'conduct a strategic defence review to equip our armed forces for 21st century challenges , and support our troops and veterans .'\n",
      "                                                                                                        \u001b[44m   \u001b[49m \u001b[44m      \u001b[49m               \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mlabour-2010-196-1\u001b[0m\n",
      "'every young person guaranteed education or training until 18 , with 75 per cent going on to higher education , or completing an advanced apprenticeship or technician level training , by the age of 30 .'\n",
      "       \u001b[44m     \u001b[49m \u001b[44m      \u001b[49m                                                                                                                                                                                       \t(thau2019)\n",
      "\u001b[1mlabour-2010-641-1\u001b[0m\n",
      "'every young person guaranteed education or training until 18 , with 75 per cent going on to higher education , or completing an advanced apprenticeship or technician level training , by the age of 30 .'\n",
      "       \u001b[44m     \u001b[49m \u001b[44m      \u001b[49m                                                                                                                                                                                       \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mlabour-2010-364-1\u001b[0m\n",
      "'the right to request flexible working for older workers , with an end to default retirement at 65 , enabling more people to decide for themselves how long they choose to keep working .'\n",
      "                                           \u001b[44m     \u001b[49m \u001b[44m       \u001b[49m                                                                                                                                 \t(thau2019)\n",
      "\u001b[1mlabour-2010-654-1\u001b[0m\n",
      "'the right to request flexible working for older workers , with an end to default retirement at 65 , enabling more people to decide for themselves how long they choose to keep working .'\n",
      "                                           \u001b[44m     \u001b[49m \u001b[44m       \u001b[49m                                                                                                                                 \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mlabour-2010-468-1\u001b[0m\n",
      "'make greener living easier and fairer through ' pay as you save ' home energy insulation , energy-bill discounts for pensioners and requiring landlords to properly insulate rented homes .'\n",
      "                                                                                                                                               \u001b[44m         \u001b[49m                                    \t(thau2019)\n",
      "\u001b[1mlabour-2010-663-1\u001b[0m\n",
      "'make greener living easier and fairer through ' pay as you save ' home energy insulation , energy-bill discounts for pensioners and requiring landlords to properly insulate rented homes .'\n",
      "                                                                                                                                               \u001b[44m         \u001b[49m                                    \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mlabour-2010-409-1\u001b[0m\n",
      "'registered supporters trusts enabled to buy stakes in their club bringing mutualism to the heart of football .'\n",
      " \u001b[44m          \u001b[49m \u001b[44m          \u001b[49m \u001b[44m      \u001b[49m                                                                                  \t(thau2019)\n",
      "\u001b[1mlabour-2010-658-1\u001b[0m\n",
      "'registered supporters trusts enabled to buy stakes in their club bringing mutualism to the heart of football .'\n",
      " \u001b[44m          \u001b[49m \u001b[44m          \u001b[49m \u001b[44m      \u001b[49m                                                                                  \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mlabour-2010-520-1\u001b[0m\n",
      "'a statutory register of lobbyists , with mps banned from working for lobbying companies and required to seek approval for paid outside appointments .'\n",
      "                         \u001b[44m         \u001b[49m                                    \u001b[44m        \u001b[49m \u001b[44m         \u001b[49m                                                              \t(thau2019)\n",
      "\u001b[1mlabour-2010-671-1\u001b[0m\n",
      "'a statutory register of lobbyists , with mps banned from working for lobbying companies and required to seek approval for paid outside appointments .'\n",
      "                         \u001b[44m         \u001b[49m                                    \u001b[44m        \u001b[49m \u001b[44m         \u001b[49m                                                              \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mlabour-2010-655-1\u001b[0m\n",
      "'a new national care service to ensure free care in the home for those with the greatest care needs and a cap on the costs of residential care so that everyone ' s homes and savings are protected from care charges after two years in a care home .'\n",
      "                                                                 \u001b[44m     \u001b[49m \u001b[44m    \u001b[49m \u001b[44m   \u001b[49m \u001b[44m        \u001b[49m \u001b[44m    \u001b[49m \u001b[44m     \u001b[49m                                                    \u001b[44m        \u001b[49m                                                                                       \t(thau2019)\n",
      "\u001b[1mlabour-2010-365-1\u001b[0m\n",
      "'a new national care service to ensure free care in the home for those with the greatest care needs and a cap on the costs of residential care so that everyone ' s homes and savings are protected from care charges after two years in a care home .'\n",
      "                                                                 \u001b[44m     \u001b[49m \u001b[44m    \u001b[49m \u001b[44m   \u001b[49m \u001b[44m        \u001b[49m \u001b[44m    \u001b[49m \u001b[44m     \u001b[49m                                                                                                                                                   \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mlabour-2010-193-1\u001b[0m\n",
      "'an expansion of free nursery places for two year olds and 15 hours a week of flexible , free nursery education for three and four year olds .'\n",
      "                                         \u001b[44m   \u001b[49m \u001b[44m    \u001b[49m \u001b[44m    \u001b[49m                                                              \u001b[44m     \u001b[49m \u001b[44m   \u001b[49m \u001b[44m    \u001b[49m \u001b[44m    \u001b[49m \u001b[44m    \u001b[49m  \t(thau2019)\n",
      "\u001b[1mlabour-2010-638-1\u001b[0m\n",
      "'an expansion of free nursery places for two year olds and 15 hours a week of flexible , free nursery education for three and four year olds .'\n",
      "                                         \u001b[44m   \u001b[49m \u001b[44m    \u001b[49m \u001b[44m    \u001b[49m                                                              \u001b[44m     \u001b[49m \u001b[44m   \u001b[49m \u001b[44m    \u001b[49m \u001b[44m    \u001b[49m \u001b[44m    \u001b[49m  \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mlabour-2010-653-1\u001b[0m\n",
      "'a new toddler tax credit of £4 a week from 2012 to give more support to all parents of young children - whether they want to stay at home or work .'\n",
      "                                                                         \u001b[44m   \u001b[49m \u001b[44m       \u001b[49m \u001b[44m  \u001b[49m \u001b[44m     \u001b[49m \u001b[44m        \u001b[49m                                              \t(thau2019)\n",
      "\u001b[1mlabour-2010-363-1\u001b[0m\n",
      "'a new toddler tax credit of £4 a week from 2012 to give more support to all parents of young children - whether they want to stay at home or work .'\n",
      "                                                                         \u001b[44m   \u001b[49m \u001b[44m       \u001b[49m \u001b[44m  \u001b[49m \u001b[44m     \u001b[49m \u001b[44m        \u001b[49m                                              \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mlabour-2010-666-1\u001b[0m\n",
      "'ensure fairness for food producers through eu reform and a supermarkets ombudsman ; and support post offices , shops and pubs in rural communities .'\n",
      "                     \u001b[44m    \u001b[49m \u001b[44m         \u001b[49m                                                                                                                  \t(thau2019)\n",
      "\u001b[1mlabour-2010-471-1\u001b[0m\n",
      "'ensure fairness for food producers through eu reform and a supermarkets ombudsman ; and support post offices , shops and pubs in rural communities .'\n",
      "                     \u001b[44m    \u001b[49m \u001b[44m         \u001b[49m                                                                                                                  \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mlabour-2010-194-1\u001b[0m\n",
      "'every pupil leaving primary school secure in the basics , with a 3rs guarantee of one-to-one and small-group tuition for every child falling behind ; and in secondary school , every pupil with a personal tutor and a choice of good qualifications .'\n",
      " \u001b[44m     \u001b[49m \u001b[44m     \u001b[49m \u001b[44m       \u001b[49m \u001b[44m       \u001b[49m \u001b[44m      \u001b[49m                                                                                       \u001b[44m     \u001b[49m \u001b[44m     \u001b[49m \u001b[44m       \u001b[49m \u001b[44m      \u001b[49m                                                                                                    \t(thau2019)\n",
      "\u001b[1mlabour-2010-639-1\u001b[0m\n",
      "'every pupil leaving primary school secure in the basics , with a 3rs guarantee of one-to-one and small-group tuition for every child falling behind ; and in secondary school , every pupil with a personal tutor and a choice of good qualifications .'\n",
      " \u001b[44m     \u001b[49m \u001b[44m     \u001b[49m \u001b[44m       \u001b[49m \u001b[44m       \u001b[49m \u001b[44m      \u001b[49m                                                                                       \u001b[44m     \u001b[49m \u001b[44m     \u001b[49m \u001b[44m       \u001b[49m \u001b[44m      \u001b[49m                                                                                                    \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mlabour-2010-258-1\u001b[0m\n",
      "'more personal care , with the right in law to choose from any provider who meets nhs standards of quality at nhs costs when booking a hospital appointment , one-to-one dedicated nursing for all cancer patients , and more care at home .'\n",
      "                                                                                                                                                                                               \u001b[44m   \u001b[49m \u001b[44m      \u001b[49m \u001b[44m        \u001b[49m                          \t(thau2019)\n",
      "\u001b[1mlabour-2010-644-1\u001b[0m\n",
      "'more personal care , with the right in law to choose from any provider who meets nhs standards of quality at nhs costs when booking a hospital appointment , one-to-one dedicated nursing for all cancer patients , and more care at home .'\n",
      "                                                                                                                                                                                               \u001b[44m   \u001b[49m \u001b[44m      \u001b[49m \u001b[44m        \u001b[49m                          \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mlabour-2010-309-1\u001b[0m\n",
      "'guarantee fast and effective action to deal with anti-social behaviour , including a right to legal injunctions for repeat victims , funded by the police or council who let them down .'\n",
      "                                                                                                                     \u001b[44m      \u001b[49m \u001b[44m       \u001b[49m                                                      \t(thau2019)\n",
      "\u001b[1mlabour-2010-649-1\u001b[0m\n",
      "'guarantee fast and effective action to deal with anti-social behaviour , including a right to legal injunctions for repeat victims , funded by the police or council who let them down .'\n",
      "                                                                                                                     \u001b[44m      \u001b[49m \u001b[44m       \u001b[49m                                                      \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mlabour-2010-260-1\u001b[0m\n",
      "'access to psychological therapy for those who need it .'\n",
      "                                     \u001b[44m     \u001b[49m \u001b[44m   \u001b[49m \u001b[44m    \u001b[49m \u001b[44m  \u001b[49m  \t(thau2019)\n",
      "\u001b[1mlabour-2010-646-1\u001b[0m\n",
      "'access to psychological therapy for those who need it .'\n",
      "                                     \u001b[44m     \u001b[49m \u001b[44m   \u001b[49m \u001b[44m    \u001b[49m \u001b[44m  \u001b[49m  \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mlabour-2010-648-1\u001b[0m\n",
      "'intervene earlier to prevent crime , with no-nonsense action to tackle the problems caused by 50 , 000 dysfunctional families .'\n",
      "                                                                                                        \u001b[44m             \u001b[49m \u001b[44m        \u001b[49m  \t(thau2019)\n",
      "\u001b[1mlabour-2010-308-1\u001b[0m\n",
      "'intervene earlier to prevent crime , with no-nonsense action to tackle the problems caused by 50 , 000 dysfunctional families .'\n",
      "                                                                                                        \u001b[44m             \u001b[49m \u001b[44m        \u001b[49m  \t(thau2019)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mlabour-2010-645-1\u001b[0m\n",
      "'the right to choose a gp in your area open at evenings and weekends , with more services available on the high-street , personal care plans and rights to individual budgets .'\n",
      "                             \u001b[44m    \u001b[49m \u001b[44m    \u001b[49m                                                                                                                                         \t(thau2019)\n",
      "\u001b[1mlabour-2010-259-1\u001b[0m\n",
      "'the right to choose a gp in your area open at evenings and weekends , with more services available on the high-street , personal care plans and rights to individual budgets .'\n",
      "                             \u001b[44m    \u001b[49m \u001b[44m    \u001b[49m                                                                                                                                         \t(thau2019)\n"
     ]
    }
   ],
   "source": [
    "# print the documents of each group of duplicates, with a separator line between groups\n",
    "if args.verbose:\n",
    "    separator = '-' * 100\n",
    "    for group_ids in duplicates_ids.values():\n",
    "        print('\\n', separator, sep='')\n",
    "        for doc_id in group_ids:\n",
    "            print(acorp.docs[acorp.doc_id2idx[doc_id]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# note: I've manually checked the cases where there were duplicate annotations.\n",
    "#  In most cases, the annotations from the same annotator for the same text (though diff. 'docs') are identical.\n",
    "#  But in the few cases where this does not hold, I manually disambiguate.\n",
    "#  Each entry maps a document ID to the list of IDs of the documents that share its text.\n",
    "#  NOTE(review): the key looks like the ID of the document whose annotation should be kept -- confirm\n",
    "disambigute_duplicates = {\n",
    "    'conservatives-1974-02-288-1': ['conservatives-1970-209-1', 'conservatives-1974-02-288-1']\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# resolve duplicates: keep exactly one document per duplicated text\n",
    "for ids in duplicates_ids.values():\n",
    "    # choose the document to keep (the 'original'): the first one by default, or the manually selected one\n",
    "    # if `disambigute_duplicates` has an entry (key: ID to keep, value: IDs of the group) covering this group\n",
    "    # note: testing `id in disambigute_duplicates.values()` would compare an ID string with lists of IDs,\n",
    "    #  which is never true, so the group members are checked against each entry's list instead\n",
    "    keep_id = ids[0]\n",
    "    for preferred_id, group in disambigute_duplicates.items():\n",
    "        if preferred_id in ids and all(doc_id in group for doc_id in ids):\n",
    "            keep_id = preferred_id\n",
    "            break\n",
    "    # NOTE(review): the lookups below assume `acorp.doc_id2idx` stays valid while documents are removed -- confirm\n",
    "    for doc_id in ids:\n",
    "        if doc_id == keep_id:\n",
    "            continue\n",
    "        doc = acorp.docs[acorp.doc_id2idx[doc_id]]\n",
    "        # remove annotations by annotators who have already annotated the kept document\n",
    "        # (iterate over a copy in case removing an annotation changes `doc.annotators`)\n",
    "        for annotator in list(doc.annotators):\n",
    "            if annotator in acorp.docs[acorp.doc_id2idx[keep_id]].annotators:\n",
    "                doc.remove_annotation(annotator)\n",
    "        # if annotations remain, pass both documents to `merge_annotations`; otherwise remove the duplicate\n",
    "        if doc.n_annotations > 0:\n",
    "            acorp.merge_annotations([keep_id, doc_id])\n",
    "        else:\n",
    "            acorp.remove_documents([doc_id])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(array([1]), array([4048]))\n"
     ]
    }
   ],
   "source": [
    "# verify: recount the texts and make sure that none occurs more than once any longer\n",
    "texts = Counter(doc.text for doc in acorp.docs)\n",
    "print(np.unique(np.asarray(list(texts.values())), return_counts=True))\n",
    "# fail loudly instead of relying on a visual check of the printed counts\n",
    "assert all(n == 1 for n in texts.values()), 'duplicated texts remain in the corpus'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rebuild the ID/index lookups from the current document order and recount the labels per annotator\n",
    "current_ids = [doc.id for doc in acorp.docs]\n",
    "acorp.doc_id2idx = {doc_id: idx for idx, doc_id in enumerate(current_ids)}\n",
    "acorp.doc_idx2id = dict(enumerate(current_ids))\n",
    "acorp.annotator_label_counts = acorp._count_annotator_labels()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Write to disk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# write the corpus unless the output file already exists and overwriting is disabled\n",
    "may_write = args.overwrite_output or not os.path.exists(args.output_file)\n",
    "if may_write:\n",
    "    output_dir = os.path.dirname(args.output_file)\n",
    "    os.makedirs(output_dir, exist_ok=True)\n",
    "    acorp.save_as_jsonlines(args.output_file, encoding='utf-8')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "group_mention_detection",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
